{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "import time\r\n",
    "import pickle\r\n",
    "import tensorflow as tf\r\n",
    "gpus = tf.config.experimental.list_physical_devices('GPU')\r\n",
    "# only use GPU memory that we need, not allocate all the GPU memory;\r\n",
    "# TensorFlow needs the memory-growth setting to be the same on every visible GPU,\r\n",
    "# so enable it on all of them instead of only on the first one\r\n",
    "# (the loop simply does nothing when no GPU is available)\r\n",
    "for gpu in gpus:\r\n",
    "    tf.config.experimental.set_memory_growth(gpu, enable=True)\r\n",
    "\r\n",
    "import tqdm\r\n",
    "import numpy as np\r\n",
    "from tensorflow.keras.preprocessing.text import Tokenizer\r\n",
    "from tensorflow.keras.preprocessing.sequence import pad_sequences\r\n",
    "from tensorflow.keras.utils import to_categorical\r\n",
    "from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard\r\n",
    "from sklearn.model_selection import train_test_split\r\n",
    "from tensorflow.keras.layers import Embedding, LSTM, Dropout, Dense\r\n",
    "from tensorflow.keras.models import Sequential\r\n",
    "from tensorflow.keras.metrics import Recall, Precision"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# --- preprocessing ---\r\n",
    "SEQUENCE_LENGTH = 100  # length of every sample's sequence (number of words per sample)\r\n",
    "EMBEDDING_SIZE = 100   # dimensionality of the GloVe embedding vectors\r\n",
    "TEST_SIZE = 0.25       # fraction of the samples held out for testing\r\n",
    "\r\n",
    "# --- training ---\r\n",
    "BATCH_SIZE = 64\r\n",
    "EPOCHS = 10\r\n",
    "\r\n",
    "# lookup tables between the textual labels and their class indices\r\n",
    "label2int = {\"ham\": 0, \"spam\": 1}\r\n",
    "int2label = {index: label for label, index in label2int.items()}"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "def load_data():\r\n",
    "    \"\"\"\r\n",
    "    Loads SMS Spam Collection dataset from `data/SMSSpamCollection`.\r\n",
    "\r\n",
    "    Every non-blank line is split on whitespace: the first token is taken\r\n",
    "    as the label and the remaining tokens, joined by single spaces, as the text.\r\n",
    "\r\n",
    "    Returns:\r\n",
    "        (texts, labels): two parallel lists of strings.\r\n",
    "    \"\"\"\r\n",
    "    texts, labels = [], []\r\n",
    "    # read as UTF-8 explicitly (like the GloVe file) instead of relying on the\r\n",
    "    # platform default encoding, which differs between operating systems\r\n",
    "    with open(\"data/SMSSpamCollection\", encoding=\"utf-8\") as f:\r\n",
    "        for line in f:\r\n",
    "            tokens = line.split()\r\n",
    "            if not tokens:\r\n",
    "                # blank line: there is no label to read, skip it\r\n",
    "                continue\r\n",
    "            labels.append(tokens[0])\r\n",
    "            texts.append(' '.join(tokens[1:]))\r\n",
    "    return texts, labels"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# read the dataset from disk: X holds the message texts, y their labels\r\n",
    "X, y = load_data()"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# Text tokenization\r\n",
    "# vectorizing text, turning each text into a sequence of integers\r\n",
    "tokenizer = Tokenizer()\r\n",
    "tokenizer.fit_on_texts(X)\r\n",
    "# dump the fitted tokenizer to a file, so we can use it in testing;\r\n",
    "# the with-block closes the file so the pickle is completely written to disk\r\n",
    "with open(\"results/tokenizer.pickle\", \"wb\") as tokenizer_file:\r\n",
    "    pickle.dump(tokenizer, tokenizer_file)\r\n",
    "\r\n",
    "# convert to sequence of integers\r\n",
    "X = tokenizer.texts_to_sequences(X)\r\n",
    "print(X[0])"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# convert the labels to a numpy array\r\n",
    "y = np.array(y)\r\n",
    "# NOTE: X is deliberately NOT wrapped in np.array() before padding: the sequences\r\n",
    "# have different lengths, and NumPy >= 1.24 raises ValueError when asked to build\r\n",
    "# an array from such a ragged list. pad_sequences takes the list of lists as is\r\n",
    "# and returns a numpy array.\r\n",
    "# pad sequences at the beginning of each sequence with 0's\r\n",
    "# for example if SEQUENCE_LENGTH=4:\r\n",
    "# [[5, 3, 2], [5, 1, 2, 3], [3, 4]]\r\n",
    "# will be transformed to:\r\n",
    "# [[0, 5, 3, 2], [5, 1, 2, 3], [0, 0, 3, 4]]\r\n",
    "X = pad_sequences(X, maxlen=SEQUENCE_LENGTH)\r\n",
    "print(X[0])"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# One Hot encoding labels\r\n",
    "# [spam, ham, spam, ham, ham] will be converted to:\r\n",
    "# [1, 0, 1, 0, 0] and then to:\r\n",
    "# [[0, 1], [1, 0], [0, 1], [1, 0], [1, 0]]\r\n",
    "\r\n",
    "y = [label2int[label] for label in y]\r\n",
    "# give the number of classes explicitly so the result always has one column per\r\n",
    "# known label, even if one of the labels never occurs in the loaded data\r\n",
    "y = to_categorical(y, num_classes=len(label2int))\r\n",
    "print(y[0])"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# shuffle the samples and hold out TEST_SIZE of them for testing\r\n",
    "X_train, X_test, y_train, y_test = train_test_split(\r\n",
    "    X, y, test_size=TEST_SIZE, random_state=7\r\n",
    ")\r\n",
    "# report the shape of every part of the split\r\n",
    "for caption, part in (\r\n",
    "    (\"X_train.shape:\", X_train),\r\n",
    "    (\"X_test.shape:\", X_test),\r\n",
    "    (\"y_train.shape:\", y_train),\r\n",
    "    (\"y_test.shape:\", y_test),\r\n",
    "):\r\n",
    "    print(caption, part.shape)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "def get_embedding_vectors(tokenizer, dim=100):\r\n",
    "    \"\"\"\r\n",
    "    Builds the embedding matrix for the tokenizer's vocabulary from the\r\n",
    "    pre-trained GloVe file `data/glove.6B.{dim}d.txt`.\r\n",
    "\r\n",
    "    Args:\r\n",
    "        tokenizer: fitted tokenizer; only its `word_index` mapping\r\n",
    "            (word -> integer index) is used.\r\n",
    "        dim: dimensionality of the GloVe vectors; also selects the file to read.\r\n",
    "\r\n",
    "    Returns:\r\n",
    "        numpy array of shape (len(word_index) + 1, dim) whose row `i` holds the\r\n",
    "        GloVe vector of the word with index `i`; rows of words that are not\r\n",
    "        found in the GloVe file stay all zeros.\r\n",
    "    \"\"\"\r\n",
    "    word_index = tokenizer.word_index\r\n",
    "    # we do +1 because Tokenizer() starts from 1\r\n",
    "    embedding_matrix = np.zeros((len(word_index)+1, dim))\r\n",
    "    with open(f\"data/glove.6B.{dim}d.txt\", encoding='utf8') as f:\r\n",
    "        for line in tqdm.tqdm(f, \"Reading GloVe\"):\r\n",
    "            values = line.split()\r\n",
    "            if not values:\r\n",
    "                # blank line, nothing to parse\r\n",
    "                continue\r\n",
    "            # only parse the vectors of words that are in our vocabulary and write\r\n",
    "            # them straight into the matrix, instead of first collecting every\r\n",
    "            # GloVe entry in an intermediate dictionary\r\n",
    "            i = word_index.get(values[0])\r\n",
    "            if i is not None:\r\n",
    "                embedding_matrix[i] = np.asarray(values[1:], dtype='float32')\r\n",
    "    return embedding_matrix"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "def get_model(tokenizer, lstm_units):\r\n",
    "    \"\"\"\r\n",
    "    Constructs the model,\r\n",
    "    Embedding vectors => LSTM => 2 output Fully-Connected neurons with softmax activation\r\n",
    "\r\n",
    "    Args:\r\n",
    "        tokenizer: fitted tokenizer whose `word_index` defines the vocabulary.\r\n",
    "        lstm_units: number of units of the LSTM layer.\r\n",
    "\r\n",
    "    Returns:\r\n",
    "        The compiled Sequential model (its summary is printed as a side effect).\r\n",
    "    \"\"\"\r\n",
    "    # get the GloVe embedding vectors; EMBEDDING_SIZE is passed explicitly so the\r\n",
    "    # width of the matrix always matches the output dimension of the Embedding layer\r\n",
    "    embedding_matrix = get_embedding_vectors(tokenizer, dim=EMBEDDING_SIZE)\r\n",
    "    model = Sequential()\r\n",
    "    model.add(Embedding(len(tokenizer.word_index)+1,\r\n",
    "              EMBEDDING_SIZE,\r\n",
    "              weights=[embedding_matrix],\r\n",
    "              trainable=False,\r\n",
    "              input_length=SEQUENCE_LENGTH))\r\n",
    "\r\n",
    "    model.add(LSTM(lstm_units, recurrent_dropout=0.2))\r\n",
    "    model.add(Dropout(0.3))\r\n",
    "    model.add(Dense(2, activation=\"softmax\"))\r\n",
    "    # compile with the rmsprop optimizer; precision and recall are computed for the\r\n",
    "    # spam class only (class_id): without it both one-hot output columns are pooled\r\n",
    "    # together and the two metrics merely mirror the accuracy\r\n",
    "    spam_class = label2int[\"spam\"]\r\n",
    "    model.compile(optimizer=\"rmsprop\", loss=\"categorical_crossentropy\",\r\n",
    "                  metrics=[\"accuracy\",\r\n",
    "                           Precision(class_id=spam_class),\r\n",
    "                           Recall(class_id=spam_class)])\r\n",
    "    model.summary()\r\n",
    "    return model"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# build and compile the classifier with a 128-unit LSTM layer\r\n",
    "model = get_model(tokenizer, lstm_units=128)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# callbacks used while training\r\n",
    "# checkpoint: the file name carries the validation loss, and a checkpoint is\r\n",
    "# only written when the monitored value improved (save_best_only)\r\n",
    "checkpoint_path = \"results/spam_classifier_{val_loss:.2f}.h5\"\r\n",
    "model_checkpoint = ModelCheckpoint(checkpoint_path, verbose=1, save_best_only=True)\r\n",
    "# tensorboard: every run logs into its own timestamped directory\r\n",
    "tensorboard_dir = f\"logs/spam_classifier_{time.time()}\"\r\n",
    "tensorboard = TensorBoard(tensorboard_dir)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# fit the model, using the held-out split as validation data\r\n",
    "model.fit(\r\n",
    "    X_train,\r\n",
    "    y_train,\r\n",
    "    batch_size=BATCH_SIZE,\r\n",
    "    epochs=EPOCHS,\r\n",
    "    validation_data=(X_test, y_test),\r\n",
    "    callbacks=[tensorboard, model_checkpoint],\r\n",
    "    verbose=1,\r\n",
    ")"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# evaluate on the test split; the result holds the loss followed by\r\n",
    "# the metrics in the order they were given to model.compile\r\n",
    "result = model.evaluate(X_test, y_test)\r\n",
    "loss, accuracy, precision, recall = result[:4]\r\n",
    "\r\n",
    "print(f\"[+] Accuracy: {accuracy*100:.2f}%\")\r\n",
    "print(f\"[+] Precision:   {precision*100:.2f}%\")\r\n",
    "print(f\"[+] Recall:   {recall*100:.2f}%\")"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "def get_predictions(text):\r\n",
    "    \"\"\"\r\n",
    "    Classifies a single message with the trained model and returns\r\n",
    "    the predicted label as a string, looked up in int2label.\r\n",
    "    \"\"\"\r\n",
    "    # a batch with one integer sequence, padded to SEQUENCE_LENGTH\r\n",
    "    padded = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=SEQUENCE_LENGTH)\r\n",
    "    # model output for the only sample of the batch\r\n",
    "    scores = model.predict(padded)[0]\r\n",
    "    # the position of the highest score is the predicted class index\r\n",
    "    predicted_class = np.argmax(scores)\r\n",
    "    return int2label[predicted_class]"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# try the classifier on a prize-announcement style message\r\n",
    "text = \"You won a prize of 1,000$, click here to claim!\"\r\n",
    "get_predictions(text)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [
    "# try the classifier on an ordinary personal message\r\n",
    "text = \"Hi man, I was wondering if we can meet tomorrow\"\r\n",
    "get_predictions(text)"
   ],
   "outputs": [],
   "metadata": {}
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "source": [],
   "outputs": [],
   "metadata": {}
  }
 ],
 "metadata": {
  "orig_nbformat": 4,
  "language_info": {
   "name": "python",
   "version": "3.8.7",
   "mimetype": "text/x-python",
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "pygments_lexer": "ipython3",
   "nbconvert_exporter": "python",
   "file_extension": ".py"
  },
  "kernelspec": {
   "name": "python3",
   "display_name": "Python 3.8.7 64-bit"
  },
  "interpreter": {
   "hash": "777490da48e046e3b512f0b24bf037db286a787493a11bf82a9e0f2cbf21bb67"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}